# Create the data directory and download the sample CSV into it.
# `mkdir -p` succeeds even when the directory already exists, so re-running
# this cell no longer short-circuits the `&&` and skips the download.
!mkdir -p data && wget -P data/ https://raw.githubusercontent.com/cauliyang/Visualizing-Data-with-Seaborn/main/data/Crimes_One_year_prior_to_present_first_1001.csv
Setting Up the Environment
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# from matplotlib import rcParams
#
# # Set global font properties to Arial
# rcParams.update(
#     {
#         "font.family": "sans-serif",
#         "font.sans-serif": "Arial",
#         "pdf.fonttype": 42,  # Embed fonts as TrueType (Type 42) for compatibility
#         "ps.fonttype": 42,
#         "text.usetex": False,
#         "svg.fonttype": "none",
#     }
# )


def stardize_columns(df):
    """Normalize the column labels of ``df`` and parse its dates, in place.

    Every column label has leading/trailing whitespace removed and each
    internal run of whitespace collapsed to a single space. The
    ``"DATE OF OCCURRENCE"`` column is then converted with
    ``pd.to_datetime``. The frame is modified directly; nothing is returned.
    """
    tidy_labels = []
    for label in df.columns:
        # str.split() with no argument drops surrounding whitespace and
        # splits on any whitespace run, so re-joining with one space
        # both trims and collapses the label.
        tidy_labels.append(" ".join(label.split()))
    df.columns = tidy_labels

    # Basic data cleaning: turn the occurrence column into real datetimes.
    df["DATE OF OCCURRENCE"] = pd.to_datetime(df["DATE OF OCCURRENCE"])


# Load the data, then clean it up in place.
df = pd.read_csv("data/Crimes_One_year_prior_to_present_first_1001.csv")
stardize_columns(df)
Understanding the Dataset
# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join every non-missing secondary description into one space-separated string.
text = " ".join(df["SECONDARY DESCRIPTION"].dropna())

# Configure the word cloud, then generate it from the combined text.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="white",
    min_font_size=10,
)
wordcloud = wordcloud.generate(text)

# Render the cloud as an image on a dedicated figure, without axis decorations.
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()
Further Exploration
Categorical Plots: Box Plot
# Tag each record with the weekday name of its occurrence timestamp.
df["DAY_OF_WEEK"] = df["DATE OF OCCURRENCE"].dt.day_name()

# Box plot of occurrence dates, one box per weekday.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df, x="DAY_OF_WEEK", y="DATE OF OCCURRENCE", ax=ax)
ax.set_ylabel("Date")
ax.set_title("Distribution of Crimes by Day of the Week")
plt.show()
Categorical Plots: Violin Plot
# Violin plot of occurrence dates, one violin per weekday.
fig, ax = plt.subplots(figsize=(12, 5))
sns.violinplot(data=df, x="DAY_OF_WEEK", y="DATE OF OCCURRENCE", ax=ax)
ax.set_ylabel("Date")
plt.show()
Categorical Plots: Boxen (Letter-Value) Plot
# Boxen (letter-value) plot of occurrence dates, one column per weekday.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxenplot(data=df, x="DAY_OF_WEEK", y="DATE OF OCCURRENCE", ax=ax)
ax.set_ylabel("Date")
plt.show()
Distribution Plots: Histogram
# Extract the hour of day from each occurrence timestamp.
df["HOUR"] = df["DATE OF OCCURRENCE"].dt.hour

# Histogram of incidents per hour with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data=df, x="HOUR", bins=24, kde=True, ax=ax)
ax.set_title("Distribution of Crimes by Hour of the Day")
plt.show()
Distribution Plots: KDE Plot
# One density curve of HOUR per crime type; common_norm=False normalizes
# each curve independently instead of scaling by group size.
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(
    data=df,
    x="HOUR",
    hue="PRIMARY DESCRIPTION",
    common_norm=False,
    ax=ax,
)
ax.set_title("Distribution of Different Crime Types by Hour")
plt.show()
Relational Plots: Scatter Plot
# Scatter incidents by coordinates, colored by primary crime description.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x="LONGITUDE",
    y="LATITUDE",
    hue="PRIMARY DESCRIPTION",
    ax=ax,
)
ax.set_title("Geographical Distribution of Crimes")
plt.show()
Relational Plots: Scatter Plot
Relational Plots: Line Plot
# Count the records that share each occurrence timestamp.
crime_counts = (
    df.groupby("DATE OF OCCURRENCE")
    .size()
    .reset_index(name="COUNT")
)

# Plot the counts against time, with slanted tick labels for readability.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=crime_counts, x="DATE OF OCCURRENCE", y="COUNT", ax=ax)
ax.set_title("Crime Trends Over Time")
plt.xticks(rotation=45)
plt.show()
Advanced Customization
plt.figure(figsize=(14, 6))
sns.set_style("whitegrid")
sns.set_palette("deep")

# The ten most frequent crime types, most common first. Computed once and
# reused for both the bar order and the count annotations.
top_crimes = df["PRIMARY DESCRIPTION"].value_counts().head(10)

g = sns.countplot(
    data=df,
    y="PRIMARY DESCRIPTION",
    order=top_crimes.index,
)
g.set_title("Top 10 Crime Types", fontsize=20)
g.set_xlabel("Count", fontsize=14)
g.set_ylabel("Crime Type", fontsize=14)

# Write each bar's count just past its right-hand end.
for row, count in enumerate(top_crimes):
    g.text(count + 3, row, str(count), color="black", va="center")

plt.tight_layout()
plt.show()
Advanced Customization
Summary
Heatmap
Useful for visualizing correlation between variables
Can show patterns and relationships in complex datasets
# Pairwise correlations between the numeric columns.
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df[numeric_cols].corr()

# Boolean mask that is True on and above the diagonal. sns.heatmap hides
# masked cells, so only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set up the matplotlib figure.
plt.figure(figsize=(8, 8))

# Draw the annotated heatmap on a diverging scale fixed to [-1, 1].
heatmap_options = {
    "mask": mask,
    "annot": True,
    "cmap": "coolwarm",
    "vmin": -1,
    "vmax": 1,
    "center": 0,
    "square": True,
    "linewidths": 0.5,
    "cbar_kws": {"shrink": 0.8},
    "fmt": ".2f",
}
sns.heatmap(corr_matrix, **heatmap_options)

plt.xticks(rotation=45, ha="right")
plt.show()
Customized Heatmap
Pair Plot
Useful for exploring relationships between multiple variables
Creates a grid of scatter plots for each pair of variables
Pair Plot
# Select relevant columns for the pair plot.
cols_to_plot = ["X COORDINATE", "Y COORDINATE", "LATITUDE", "LONGITUDE"]

# Add hour of day. Re-deriving it here keeps the cell runnable on its own.
df["HOUR"] = pd.to_datetime(df["DATE OF OCCURRENCE"]).dt.hour

# Create the pair plot.
# sns.pairplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it: one would only leave behind an empty figure
# ("<Figure size 480x480 with 0 Axes>") without affecting the plot size.
pairplot = sns.pairplot(
    df[cols_to_plot + ["HOUR", "PRIMARY DESCRIPTION"]],
    hue="PRIMARY DESCRIPTION",
    palette="viridis",
    plot_kws={"alpha": 0.6},
    diag_kind="kde",
)
plt.tight_layout()
plt.show()
Pair Plot
<Figure size 480x480 with 0 Axes>
Regression Plot
Visualizes the relationship between two variables
Includes a linear regression line and confidence interval
# Regress WARD on BEAT in a grid of facets: one column per ARREST value and
# one row per DOMESTIC value, each facet with its own axis ranges.
g = sns.lmplot(
    data=df,
    x="BEAT",
    y="WARD",
    col="ARREST",
    row="DOMESTIC",
    height=3,
    aspect=2,
    facet_kws=dict(sharex=False, sharey=False),
    scatter_kws={"alpha": 0.5},
)

# lmplot returns a FacetGrid, so the overall title goes on the figure.
# plt.title() would only retitle the last facet and overwrite its label.
# The text names the variables actually plotted (BEAT and WARD).
g.figure.suptitle(
    "Regression Plot: Ward vs Beat by Arrest and Domestic Status", y=1.02
)
plt.show()
Regression Plot
Advanced Seaborn: FacetGrid
Demonstrates how to create multiple plots in a grid
Useful for comparing distributions across categories
# Create a FacetGrid with one panel per crime type, wrapped three per row.
# FacetGrid builds its own figure, so no plt.figure() call precedes it: one
# would only leave behind an unused empty figure.
g = sns.FacetGrid(df, col="PRIMARY DESCRIPTION", col_wrap=3, height=4, aspect=1.5)

# Map a 24-bin histogram of HOUR onto each subplot.
g.map(plt.hist, "HOUR", bins=24)

# Customize the plot: shared axis labels, per-panel titles, overall title.
g.set_axis_labels("Hour of Day", "Count")
g.set_titles("{col_name}")
g.figure.suptitle(
    "Distribution of Crimes by Hour for Different Crime Types", y=1.02
)
g.tight_layout()
plt.show()